OLIST E-Commerce Public Dataset

Data Cleaning

Imports

In [1]:
import pandas as pd
from IPython.display import display
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np

Creating DataFrame

In [2]:
customer = pd.read_csv('olist_customers_dataset.csv')
geolocation = pd.read_csv('olist_geolocation_dataset.csv')
orders = pd.read_csv('olist_orders_dataset.csv')
order_items = pd.read_csv('olist_order_items_dataset.csv')
order_payments = pd.read_csv('olist_order_payments_dataset.csv')
order_reviews = pd.read_csv('olist_order_reviews_dataset.csv')
products = pd.read_csv('olist_products_dataset.csv')
sellers = pd.read_csv('olist_sellers_dataset.csv')
english_desc = pd.read_csv('product_category_name_translation.csv')

Understanding Our Data Values

In [3]:
#Taking all data frames in a list, so that we can perform specific operations on all of them by looping
df_list = [customer,geolocation,orders,order_items,order_payments,order_reviews,products,sellers]
In [4]:
#Looping using the display function imported above, because Jupyter doesnt print in tabular formats(true for every print)
for i in df_list:
    display(i.head(5))
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP
1 18955e83d337fd6b2def6b18a428ac77 290c77bc529b7ac935b93aa66c333dc3 9790 sao bernardo do campo SP
2 4e7b3e00288586ebd08712fdd0374a03 060e732b5b29e8181a18229c7b0b2b5e 1151 sao paulo SP
3 b2b6027bc5c5109e529d4dc6358b12c3 259dac757896d24d7702b9acbbff3f3c 8775 mogi das cruzes SP
4 4f2d8ab171c80ec8364f7c12e35b23ad 345ecd01c38d18a9036ed96c73b8d066 13056 campinas SP
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
0 1037 -23.545621 -46.639292 sao paulo SP
1 1046 -23.546081 -46.644820 sao paulo SP
2 1046 -23.546129 -46.642951 sao paulo SP
3 1041 -23.544392 -46.639499 sao paulo SP
4 1035 -23.541578 -46.641607 sao paulo SP
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13 00:00:00
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04 00:00:00
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15 00:00:00
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26 00:00:00
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 2017-09-19 09:45:35 58.90 13.29
1 00018f77f2f0320c557190d7a144bdd3 1 e5f2d52b802189ee658865ca93d83a8f dd7ddc04e1b6c2c614352b383efe2d36 2017-05-03 11:05:13 239.90 19.93
2 000229ec398224ef6ca0657da4fc703e 1 c777355d18b72b67abbeef9df44fd0fd 5b51032eddd242adc84c38acab88f23d 2018-01-18 14:48:30 199.00 17.87
3 00024acbcdf0a6daa1e931b038114c75 1 7634da152a4610f1595efa32f14722fc 9d7a1d34a5052409006425275ba1c2b4 2018-08-15 10:10:18 12.99 12.79
4 00042b26cf59d7ce69dfabb4e55b4fd9 1 ac6c3623068f30de03045865e4e10089 df560393f3a51e74553ab94004ba5c87 2017-02-13 13:57:51 199.90 18.14
order_id payment_sequential payment_type payment_installments payment_value
0 b81ef226f3fe1789b1e8b2acac839d17 1 credit_card 8 99.33
1 a9810da82917af2d9aefd1278f1dcfa0 1 credit_card 1 24.39
2 25e8ea4e93396b6fa0d3dd708e76c1bd 1 credit_card 1 65.71
3 ba78997921bbcdc1373bb41e913ab953 1 credit_card 8 107.78
4 42fdf880ba16b47b59251dd489d4441a 1 credit_card 2 128.45
review_id order_id review_score review_comment_title review_comment_message review_creation_date review_answer_timestamp
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4 NaN NaN 2018-01-18 00:00:00 2018-01-18 21:46:59
1 80e641a11e56f04c1ad469d5645fdfde a548910a1c6147796b98fdf73dbeba33 5 NaN NaN 2018-03-10 00:00:00 2018-03-11 03:05:13
2 228ce5500dc1d8e020d8d1322874b6f0 f9e4b658b201a9f2ecdecbb34bed034b 5 NaN NaN 2018-02-17 00:00:00 2018-02-18 14:36:24
3 e64fb393e7b32834bb789ff8bb30750e 658677c97b385a9be170737859d3511b 5 NaN Recebi bem antes do prazo estipulado. 2017-04-21 00:00:00 2017-04-21 22:02:06
4 f7c4243c7fe1938f181bec41a392bdeb 8e6bfb81e283fa7e4f11123a3fb894f1 5 NaN Parabéns lojas lannister adorei comprar pela I... 2018-03-01 00:00:00 2018-03-02 10:26:53
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40.0 287.0 1.0 225.0 16.0 10.0 14.0
1 3aa071139cb16b67ca9e5dea641aaa2f artes 44.0 276.0 1.0 1000.0 30.0 18.0 20.0
2 96bd76ec8810374ed1b65e291975717f esporte_lazer 46.0 250.0 1.0 154.0 18.0 9.0 15.0
3 cef67bcfe19066a932b7673e239eb23d bebes 27.0 261.0 1.0 371.0 26.0 4.0 26.0
4 9dc1a7de274444849c219cff195d0b71 utilidades_domesticas 37.0 402.0 4.0 625.0 20.0 17.0 13.0
seller_id seller_zip_code_prefix seller_city seller_state
0 3442f8959a84dea7ee197c632cb2df15 13023 campinas SP
1 d1b65fc7debc3361ea86b5f14c68d2e2 13844 mogi guacu SP
2 ce3ad9de960102d0677a81f5d0bb7b2d 20031 rio de janeiro RJ
3 c0f3eea2e14555b6faeea3dd58c1b1c3 4195 sao paulo SP
4 51a04a8a6bdcb23deccc82b0b80742cf 12914 braganca paulista SP

Finding Attributes of each table with loops

In [5]:
#Finding Attributes of each table with loops
df_names = ['customer','geolocation','orders','order_items','order_payments','order_reviews','products','sellers']
df_info_table = pd.DataFrame({}) # Creating an Empty Dataframe
df_info_table['Dataframe'] = df_names # Creating a Column with Dataframe(WIll work as in index)
df_info_table['Rows'] = [df.shape[0] for df in df_list] #List Comprehension
df_info_table['Columns'] = [df.shape[1] for df in df_list]
df_info_table['Null Value Count'] = [df.isnull().sum().sum() for df in df_list]
df_info_table['Null Columns'] = [len([col for col, null in df.isnull().sum().items() if null > 0]) for df in df_list]
df_info_table['Null Columns Name'] = [', '.join([col for col, null in df.isnull().sum().items() if null > 0]) for df in df_list]


#Chosing a Colour Paletter with Seaborn Library
cm = sns.light_palette("maroon", as_cmap=True)

#Mapping the style to the dataframe
df_info_table.style.background_gradient(cmap=cm)
Out[5]:
Dataframe Rows Columns Null Value Count Null Columns Null Columns Name
0 customer 99441 5 0 0
1 geolocation 1000163 5 0 0
2 orders 99441 8 4908 3 order_approved_at, order_delivered_carrier_date, order_delivered_customer_date
3 order_items 112650 7 0 0
4 order_payments 103886 5 0 0
5 order_reviews 100000 7 146532 2 review_comment_title, review_comment_message
6 products 32951 9 2448 8 product_category_name, product_name_lenght, product_description_lenght, product_photos_qty, product_weight_g, product_length_cm, product_height_cm, product_width_cm
7 sellers 3095 4 0 0

Assessing The Columns of Dataframes

Assessing Cutomer Table

In [6]:
customer.dtypes
Out[6]:
customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object
In [7]:
#What is Customer Unique ID and Customer ID? We can Drop One of the Two Columns
len(customer.customer_unique_id.unique()),len(customer.customer_id.unique())
Out[7]:
(96096, 99441)
In [8]:
#Dropping "customer_unique_id"
customer.drop(['customer_unique_id'],axis=1).head(1) #Axis = 1 drops the column
Out[8]:
customer_id customer_zip_code_prefix customer_city customer_state
0 06b8999e2fba1a1fbc88172c00ba8bc7 14409 franca SP

Assessing "Orders" Table

In [9]:
orders.dtypes
Out[9]:
order_id                         object
customer_id                      object
order_status                     object
order_purchase_timestamp         object
order_approved_at                object
order_delivered_carrier_date     object
order_delivered_customer_date    object
order_estimated_delivery_date    object
dtype: object
In [10]:
orders.head(1)
Out[10]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 00:00:00
In [11]:
orders.order_status.unique()
Out[11]:
array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
       'canceled', 'created', 'approved'], dtype=object)
In [12]:
#We see that all the date and time stamps have a time associated with it we wont need the time for this project, hence we remove them
orders.columns
Out[12]:
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date'],
      dtype='object')
In [13]:
orders['order_purchase_date']= pd.to_datetime(orders['order_purchase_timestamp']).dt.date
orders['order_purchase_time']= pd.to_datetime(orders['order_purchase_timestamp']).dt.time
orders['order_delivered_time'] = pd.to_datetime(orders['order_delivered_customer_date']).dt.time
orders['order_delivered_date'] = pd.to_datetime(orders['order_delivered_customer_date']).dt.date
orders['order_estimated_delivery_time'] = pd.to_datetime(orders['order_estimated_delivery_date']).dt.time
orders['order_estimated_delivery_date'] = pd.to_datetime(orders['order_estimated_delivery_date']).dt.date
orders['order_approved_at'] = pd.to_datetime(orders['order_approved_at'])
In [14]:
orders['order_delivered_date'] = orders['order_delivered_date'].apply(pd.to_datetime)
orders['order_purchase_date'] = orders['order_purchase_date'].apply(pd.to_datetime)
In [15]:
orders.head(1)
Out[15]:
order_id customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date order_purchase_date order_purchase_time order_delivered_time order_delivered_date order_estimated_delivery_time
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18 2017-10-02 10:56:33 21:25:13 2017-10-10 00:00:00
In [16]:
orders.columns
Out[16]:
Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_purchase_date', 'order_purchase_time', 'order_delivered_time',
       'order_delivered_date', 'order_estimated_delivery_time'],
      dtype='object')
In [17]:
#order approve vs purchase
In [18]:
orders=orders.drop(columns=['order_purchase_timestamp',
       'order_approved_at',
       'order_delivered_customer_date','order_purchase_time', 'order_delivered_time','order_estimated_delivery_time','order_delivered_carrier_date'])
In [19]:
orders.head()
Out[19]:
order_id customer_id order_status order_estimated_delivery_date order_purchase_date order_delivered_date
0 e481f51cbdc54678b7cc49136f2d6af7 9ef432eb6251297304e76186b10a928d delivered 2017-10-18 2017-10-02 2017-10-10
1 53cdb2fc8bc7dce0b6741e2150273451 b0830fb4747a6c6d20dea0b8c802d7ef delivered 2018-08-13 2018-07-24 2018-08-07
2 47770eb9100c2d0c44946d9cf07ec65d 41ce2a54c0b03bf3443c3d931a367089 delivered 2018-09-04 2018-08-08 2018-08-17
3 949d5b44dbf5de918fe9c16f97b45f8a f88197465ea7920adcdbec7375364d82 delivered 2017-12-15 2017-11-18 2017-12-02
4 ad21c59c0840e6cb83a9ceb5573f8159 8ab97904e6daea8866dbdbc4fb7aad2c delivered 2018-02-26 2018-02-13 2018-02-16

Assessing "Order Items" Dataframe

In [20]:
order_items.dtypes
Out[20]:
order_id                object
order_item_id            int64
product_id              object
seller_id               object
shipping_limit_date     object
price                  float64
freight_value          float64
dtype: object
In [21]:
order_items.columns
Out[21]:
Index(['order_id', 'order_item_id', 'product_id', 'seller_id',
       'shipping_limit_date', 'price', 'freight_value'],
      dtype='object')
In [22]:
order_items = order_items.drop(['shipping_limit_date'],axis=1)
In [23]:
order_items.head(1)
Out[23]:
order_id order_item_id product_id seller_id price freight_value
0 00010242fe8c5a6d1ba2dd792cb16214 1 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 58.9 13.29

Assessing "Order Reviews" Dataframe

In [24]:
order_reviews.dtypes
Out[24]:
review_id                  object
order_id                   object
review_score                int64
review_comment_title       object
review_comment_message     object
review_creation_date       object
review_answer_timestamp    object
dtype: object
In [25]:
#We only need the Review Score for this Project, Hence We Drop rest of the columns
order_reviews = order_reviews.drop(columns=['review_comment_title','review_comment_message','review_creation_date','review_answer_timestamp'],axis=1)
In [26]:
order_reviews.head(1)
Out[26]:
review_id order_id review_score
0 7bc2406110b926393aa56f80a40eba40 73fc7af87114b39712e6da79b0a377eb 4

Assessing "Products" Dataframe

In [27]:
products.dtypes
Out[27]:
product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object
In [28]:
products.columns
Out[28]:
Index(['product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'],
      dtype='object')
In [29]:
products.head(1)
Out[29]:
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
0 1e9e8ef04dbcff4541ed26657ea517e5 perfumaria 40.0 287.0 1.0 225.0 16.0 10.0 14.0

Assessing "Sellers" Dataframe

In [30]:
sellers.dtypes
Out[30]:
seller_id                 object
seller_zip_code_prefix     int64
seller_city               object
seller_state              object
dtype: object
In [31]:
#####################################
In [32]:
#Exporting

customer.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\customer.csv')
geolocation.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\geolocation.csv')
orders.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\orders.csv')
order_items.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\order_items.csv')
order_payments.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\order_payments.csv')
order_reviews.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\order_reviews.csv')
products.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\products.csv')
sellers.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\sellers.csv')
In [33]:
########################################
In [34]:
final = pd.merge(customer, orders, on ='customer_id')
final = pd.merge(final,order_items,on ='order_id')
final = pd.merge(final,order_payments,on ='order_id')
final = pd.merge(final,order_reviews,on ='order_id')
final = pd.merge(final,products,on ='product_id')
final = pd.merge(final,sellers,on ='seller_id')
final = pd.merge(final,english_desc,on='product_category_name')
final.dropna()
final.to_csv(r'C:\Users\Suyash\BALC Project\Cleaned\final.csv')
In [35]:
##########################################
In [36]:
###################################################
In [37]:
final.columns
Out[37]:
Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'order_id', 'order_status',
       'order_estimated_delivery_date', 'order_purchase_date',
       'order_delivered_date', 'order_item_id', 'product_id', 'seller_id',
       'price', 'freight_value', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value', 'review_id', 'review_score',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'seller_zip_code_prefix', 'seller_city', 'seller_state',
       'product_category_name_english'],
      dtype='object')
In [38]:
df_info_table = pd.DataFrame({}) # Creating an Empty Dataframe
df_info_table['Dataframe'] = 'Final' # Creating a Column with Dataframe(WIll work as in index)
df_info_table['Rows'] = final.shape[0] #List Comprehension
df_info_table['Columns'] = final.shape[1]
df_info_table['Null Value Count'] = final.isnull().sum().sum()
df_info_table['Null Columns'] = [len([col for col, null in final.isnull().sum().items() if null > 0])]
df_info_table['Null Columns Name'] = [', '.join([col for col, null in final.isnull().sum().items() if null > 0])]


#Chosing a Colour Paletter with Seaborn Library
cm = sns.light_palette("maroon", as_cmap=True)

#Mapping the style to the dataframe
df_info_table.style.background_gradient(cmap=cm)
D:\Anaconda3\lib\site-packages\matplotlib\colors.py:479: RuntimeWarning:

invalid value encountered in less

Out[38]:
Dataframe Rows Columns Null Value Count Null Columns Null Columns Name
0 nan nan nan nan 5 order_delivered_date, product_weight_g, product_length_cm, product_height_cm, product_width_cm
In [39]:
final.head()
Out[39]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state order_id order_status order_estimated_delivery_date order_purchase_date order_delivered_date ... product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm seller_zip_code_prefix seller_city seller_state product_category_name_english
0 06b8999e2fba1a1fbc88172c00ba8bc7 861eff4711a542e4b93843c6dd7febb0 14409 franca SP 00e7ee1b050b8499577073aeb2a297a1 delivered 2017-06-05 2017-05-16 2017-05-25 ... 1141.0 1.0 8683.0 54.0 64.0 31.0 8577 itaquaquecetuba SP office_furniture
1 8912fc0c3bbf1e2fbf35819e21706718 9eae34bbd3a474ec5d07949ca7de67c0 68030 santarem PA c1d2b34febe9cd269e378117d6681172 delivered 2017-12-19 2017-11-09 2017-11-28 ... 1141.0 1.0 8683.0 54.0 64.0 31.0 8577 itaquaquecetuba SP office_furniture
2 8912fc0c3bbf1e2fbf35819e21706718 9eae34bbd3a474ec5d07949ca7de67c0 68030 santarem PA c1d2b34febe9cd269e378117d6681172 delivered 2017-12-19 2017-11-09 2017-11-28 ... 1141.0 1.0 8683.0 54.0 64.0 31.0 8577 itaquaquecetuba SP office_furniture
3 f0ac8e5a239118859b1734e1087cbb1f 3c799d181c34d51f6d44bbbc563024db 92480 nova santa rita RS b1a5d5365d330d10485e0203d54ab9e8 delivered 2017-06-12 2017-05-07 2017-05-26 ... 1141.0 1.0 8683.0 54.0 64.0 31.0 8577 itaquaquecetuba SP office_furniture
4 6bc8d08963a135220ed6c6d098831f84 23397e992b09769faf5e66f9e171a241 25931 mage RJ 2e604b3614664aa66867856dba7e61b7 delivered 2018-03-22 2018-02-03 2018-02-28 ... 1141.0 1.0 8683.0 54.0 64.0 31.0 8577 itaquaquecetuba SP office_furniture

5 rows × 33 columns

In [40]:
final.dtypes
Out[40]:
customer_id                              object
customer_unique_id                       object
customer_zip_code_prefix                  int64
customer_city                            object
customer_state                           object
order_id                                 object
order_status                             object
order_estimated_delivery_date            object
order_purchase_date              datetime64[ns]
order_delivered_date             datetime64[ns]
order_item_id                             int64
product_id                               object
seller_id                                object
price                                   float64
freight_value                           float64
payment_sequential                        int64
payment_type                             object
payment_installments                      int64
payment_value                           float64
review_id                                object
review_score                              int64
product_category_name                    object
product_name_lenght                     float64
product_description_lenght              float64
product_photos_qty                      float64
product_weight_g                        float64
product_length_cm                       float64
product_height_cm                       float64
product_width_cm                        float64
seller_zip_code_prefix                    int64
seller_city                              object
seller_state                             object
product_category_name_english            object
dtype: object

Exploratory Data Analysis

We will try to see the orders by Year, Month and Then Days

In [41]:
orders.dtypes
Out[41]:
order_id                                 object
customer_id                              object
order_status                             object
order_estimated_delivery_date            object
order_purchase_date              datetime64[ns]
order_delivered_date             datetime64[ns]
dtype: object
In [42]:
time_series_df = pd.DataFrame(columns=['Date','Orders'])
In [43]:
time_series_df['Date'] = orders['order_purchase_date'].dt.date
time_series_df['Orders']=1
In [44]:
time_series_df = time_series_df.groupby(['Date']).sum()
In [45]:
time_series_df = time_series_df.reset_index()
In [46]:
time_series_df['Year'] = orders['order_purchase_date'].dt.year

Year Plot

In [47]:
fig = px.histogram(time_series_df, x="Year", y="Orders", histfunc="sum", title="Orders By Year",color=time_series_df['Year'])
fig.update_traces(xbins_size=0)
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="Y")
fig.update_layout(bargap=0.5)
fig.show()

Month Plot

In [48]:
fig = px.histogram(time_series_df, x="Date", y="Orders", histfunc="sum", title="Orders By Month",color=time_series_df['Orders'])
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.3)
fig.show()

Plotting Category By Time

In [49]:
order_items.columns
Out[49]:
Index(['order_id', 'order_item_id', 'product_id', 'seller_id', 'price',
       'freight_value'],
      dtype='object')
In [50]:
products.columns
Out[50]:
Index(['product_id', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm'],
      dtype='object')
In [51]:
orders.columns
Out[51]:
Index(['order_id', 'customer_id', 'order_status',
       'order_estimated_delivery_date', 'order_purchase_date',
       'order_delivered_date'],
      dtype='object')
In [52]:
time_series_category = order_items.merge(products,on=['product_id'],how='left')
time_series_category = time_series_category.merge(orders,on=['order_id'],how='left')
In [53]:
time_series_category['order'] = 1
In [54]:
time_series_category.columns
Out[54]:
Index(['order_id', 'order_item_id', 'product_id', 'seller_id', 'price',
       'freight_value', 'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'customer_id', 'order_status', 'order_estimated_delivery_date',
       'order_purchase_date', 'order_delivered_date', 'order'],
      dtype='object')
In [55]:
time_series_category = time_series_category[['product_category_name', 'order_purchase_date','order']]
In [56]:
time_series_category = time_series_category.groupby(['order_purchase_date','product_category_name']).sum().reset_index()
In [57]:
time_series_category.head(10)
Out[57]:
order_purchase_date product_category_name order
0 2016-09-04 moveis_decoracao 2
1 2016-09-05 telefonia 1
2 2016-09-15 beleza_saude 3
3 2016-10-02 bebes 1
4 2016-10-03 brinquedos 1
5 2016-10-03 esporte_lazer 3
6 2016-10-03 fashion_calcados 1
7 2016-10-03 moveis_decoracao 2
8 2016-10-03 relogios_presentes 1
9 2016-10-04 automotivo 1
In [58]:
fig = px.histogram(time_series_category, x='order_purchase_date', y="order", histfunc="sum", title="Orders By Month",color=time_series_category['product_category_name'])
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.3)
fig.show()

Date Plot

In [59]:
fig = px.bar(time_series_df, x = 'Date',y="Orders",color='Orders',title='Orders By Date')
fig.show()

Analysis Summary

We See Unsually high numbers of orders on 24th November 2017, So it is plausible that there was a festival or a holiday on that day or people were buying before presents before christmas.

Deliver Times Summary

In [60]:
delivery_analysis = pd.DataFrame(columns =['Purchase','Delivered'])
In [61]:
delivery_analysis['Purchase'] = orders['order_purchase_date']
delivery_analysis['Delivered'] = orders['order_delivered_date']
In [62]:
delivery_analysis['Time Taken'] = (delivery_analysis['Delivered'] - delivery_analysis['Purchase']).dt.days
In [63]:
delivery_analysis = delivery_analysis.dropna()
In [64]:
delivery_analysis.head(5)
Out[64]:
Purchase Delivered Time Taken
0 2017-10-02 2017-10-10 8.0
1 2018-07-24 2018-08-07 14.0
2 2018-08-08 2018-08-17 9.0
3 2017-11-18 2017-12-02 14.0
4 2018-02-13 2018-02-16 3.0

Lets see the Summary of our Deliveries

In [65]:
 delivery_analysis['Time Taken'].describe()
Out[65]:
count    96476.000000
mean        12.497336
std          9.555460
min          0.000000
25%          7.000000
50%         10.000000
75%         16.000000
max        210.000000
Name: Time Taken, dtype: float64

So on an average we take 12.5 days to deliver a product and we deliver 50% of the products below this threshold, we also see bizzare values of 100 and upto 210 Days we can also see it in the box plot below

In [66]:
'''
fig = go.Figure(data=[go.Box(y=delivery_analysis['Time Taken'],
            boxpoints='all', # can also be outliers, or suspectedoutliers, or False
            jitter=0.3, # add some jitter for a better separation between points
            pointpos=-1.8 # relative position of points wrt box
              )])

fig.show()
'''

#Interactive BoxPlot - Not an Optimal Code do not run
Out[66]:
"\nfig = go.Figure(data=[go.Box(y=delivery_analysis['Time Taken'],\n            boxpoints='all', # can also be outliers, or suspectedoutliers, or False\n            jitter=0.3, # add some jitter for a better separation between points\n            pointpos=-1.8 # relative position of points wrt box\n              )])\n\nfig.show()\n"
In [67]:
ax = sns.boxplot(y="Time Taken", data=delivery_analysis, whis=np.inf,color='cyan')
ax = sns.stripplot(y="Time Taken", data=delivery_analysis, color="maroon",alpha=.05)

Freight Cost to Wait Time Analysis

We are trying to Analyze which are the highest performing cities and why is our wait time large for 50% of our demographic

In [68]:
 city_analysis = pd.DataFrame(columns=[
       'City', 'State', 'Time Taken','Total Price'])
In [69]:
city_analysis['City'] = final['customer_city']
city_analysis['State'] = final['customer_state']
city_analysis['Time Taken'] = (final['order_delivered_date'] - final['order_purchase_date']).dt.days
city_analysis['Cost'] = final['price'] 
city_analysis['Freight'] = final['freight_value']
city_analysis['Order'] = 1
In [70]:
city_analysis.head(1)
Out[70]:
City State Time Taken Total Price Cost Freight Order
0 franca SP 9.0 NaN 124.99 21.88 1
In [71]:
city_analysis = city_analysis.groupby('City').sum()
In [72]:
city_analysis['Freight Avg'] = city_analysis['Freight']/city_analysis['Order']
city_analysis['Time Taken'] = city_analysis['Time Taken']/city_analysis['Order']
In [73]:
city_analysis = city_analysis.dropna()
In [74]:
city_analysis.head(1)
Out[74]:
Time Taken Cost Freight Order Freight Avg
City
abadia dos dourados 12.0 358.9 54.6 3 18.2

We check for the cities that ordered the least porducts

In [75]:
city_analysis['Order'].describe()
Out[75]:
count     4095.000000
mean        28.469109
std        331.285924
min          1.000000
25%          1.000000
50%          3.000000
75%         10.000000
max      18412.000000
Name: Order, dtype: float64

We check for the Freight Average of the lowest puchasign cities vs Highest Purchasing cities

In [76]:
print('Cities With less than 3 Orders = ',(city_analysis['Order'] <= 10).sum())
Cities With less than 3 Orders =  3089
In [77]:
print('Cities With Greater than 10 Orders = ',(city_analysis['Order'] > 10).sum())
Cities With Greater than 10 Orders =  1006
In [78]:
city_analysis['High/Lo'] = city_analysis['Order'].apply(lambda x: 'High' if x > 10 else 'Low')
In [79]:
city_analysis.head(1)
Out[79]:
Time Taken Cost Freight Order Freight Avg High/Lo
City
abadia dos dourados 12.0 358.9 54.6 3 18.2 Low
In [80]:
ax = sns.boxplot(x='High/Lo',y="Freight Avg", data=city_analysis, whis=np.inf)

Plotting on Nautral Log Axis

In [81]:
ax = sns.boxplot(x='High/Lo',y="Freight Avg", data=city_analysis, whis=np.inf)
ax.set_yscale('log')

Analysis Summary:

We see that the Cost for Low ordering cities is definitely a little higher but we cannot establish causality here

Wait Time Analysis

We also should do an analysis of wait times as it could probably be that wait times could be resultin in low orders

In [82]:
city_analysis['Time Taken'].describe()
Out[82]:
count    4095.000000
mean       15.742718
std         8.065810
min         0.000000
25%        11.000000
50%        14.000000
75%        18.500000
max       148.000000
Name: Time Taken, dtype: float64

We have a standard deviation of 8 so we can say that all delivery times above 21 are high wait times and all below 8 are low wait times based on this information we proceed ahead

In [83]:
city_analysis['Wait'] = city_analysis['Time Taken'].apply(lambda x: 'High' if x >= 21 else ('Low' if x <= 8 else 'Normal'))
In [84]:
x = sns.boxplot(x='Wait',y="Order", data=city_analysis, whis=np.inf)
x.set_yscale('log')

Analysis Summary:

We See that High wait time cities do infact order less than Low and Normal wait time cities, we need to perform t-test to validate this, but we might be onto something.

Per State Wait Time Analysis

In [85]:
 city_wait_time = customer.merge(orders,on='customer_id',how='left')
In [86]:
city_wait_time.columns
Out[86]:
Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'order_id', 'order_status',
       'order_estimated_delivery_date', 'order_purchase_date',
       'order_delivered_date'],
      dtype='object')
In [87]:
city_wait_time = city_wait_time[['customer_city', 'customer_state', 'order_id','order_purchase_date',
       'order_delivered_date']]
In [88]:
city_wait_time['time_taken']  = (city_wait_time['order_delivered_date'] - city_wait_time['order_purchase_date']).dt.days
In [89]:
 city_wait_time.columns
Out[89]:
Index(['customer_city', 'customer_state', 'order_id', 'order_purchase_date',
       'order_delivered_date', 'time_taken'],
      dtype='object')
In [90]:
city_wait_time = city_wait_time[['customer_city', 'customer_state', 'time_taken']]
In [91]:
city_wait_time['total_order'] = 1 
In [92]:
city_wait_time.head(1)
Out[92]:
customer_city customer_state time_taken total_order
0 franca SP 9.0 1
In [93]:
fig, ax = plt.subplots()
x = sns.boxplot(x='customer_state',y="time_taken", data=city_wait_time, whis=np.inf)
fig.set_size_inches(11.7, 8.27)
x.set_title("Time Variation By State",fontsize=20)
x.set_xlabel("Sate",fontsize=14)
x.set_ylabel("Time Taken In Days",fontsize=14)
Out[93]:
Text(0, 0.5, 'Time Taken In Days')
In [94]:
fig, ax = plt.subplots()
g = sns.countplot(x='customer_state', data=final, orient='h')
g.set_title("Customer's State Distribution",fontsize=20)
g.set_xlabel("State",fontsize=14)
g.set_ylabel("Count",fontsize=14)
fig.set_size_inches(11.7, 8.27)
In [95]:
fig, ax = plt.subplots()
g = sns.countplot(x='seller_state', data=final, orient='h')
g.set_title("Sellers's State Distribution",fontsize=20)
g.set_xlabel("State",fontsize=14)
g.set_ylabel("Count",fontsize=14)
fig.set_size_inches(11.7, 8.27)
In [96]:
k=pd.DataFrame({'customers':customer['customer_state'].value_counts(),'sellers':sellers['seller_state'].value_counts()})
k=k.sort_values(by='customers',ascending=False)
k=k.fillna(0)
k['sellers']= k['sellers'].apply( lambda x:x/k['sellers'].sum())
k['customers']= k['customers'].apply( lambda x:x/k['customers'].sum())
labels = k.T.columns
sel = k['sellers']
cus = k['customers']

x = np.arange(len(labels))  # the label locations
width = 0.2  # the width of the bars

fig, ax = plt.subplots(figsize=(20,10))

rects1 = ax.bar(x - width/2, sel, width, label='Sellers')
rects2 = ax.bar(x + width/2, cus, width, label='Customers')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Percentage of total per state',fontsize=14)
ax.set_xlabel('States',fontsize=14)
ax.set_title('Customers and sellers location by state',fontsize=20)
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()
Out[96]:
<matplotlib.legend.Legend at 0x18c66c904a8>
In [97]:
fig, ax = plt.subplots()
g3 = sns.boxplot(x='customer_state', y='freight_value', 
                 data=final[final['price'] != -1])
g3.set_title("CUSTOMER's State by Freight Value", fontsize=20)
g3.set_xlabel("State",fontsize=14)
g3.set_ylabel("Freight Value",fontsize=14)
fig.set_size_inches(11.7, 8.27)
In [98]:
 
# The nicest state ? AP: macapa laranjal do jari

plt.figure(figsize=(14,8))

sns.barplot(data = final, x='customer_state', y='review_score',\
           order = final.groupby('customer_state').mean()\
            .reset_index().sort_values('review_score', ascending=False)['customer_state'].values,
           errcolor = 'grey');


plt.title('Most Satisfied State'.title(), fontsize=20);
plt.ylabel('Average Review Score'.title(), fontsize=14);
plt.xlabel('customer state'.title(), fontsize=14);

Cluster Analysis

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
sns.set(style="ticks")
import gc
import itertools
from datetime import datetime
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore")
pd.set_option('display.max_columns', 100)
np.random.seed(42)
import plotly
from datetime import datetime, timedelta
import plotly.offline as pyoff
import plotly.graph_objs as go
#initiate visualization library for jupyter notebook 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
pyoff.init_notebook_mode(connected=True)
%matplotlib inline



#defining visualizaition functions
def format_spines(ax, right_border=True):
    
    ax.spines['bottom'].set_color('#666666')
    ax.spines['left'].set_color('#666666')
    ax.spines['top'].set_visible(False)
    if right_border:
        ax.spines['right'].set_color('#FFFFFF')
    else:
        ax.spines['right'].set_color('#FFFFFF')
    ax.patch.set_facecolor('#FFFFFF')
    

def count_plot(feature, df, colors='Blues_d', hue=False, ax=None, title=''):
    
    # Preparing variables
    ncount = len(df)
    if hue != False:
        ax = sns.countplot(x=feature, data=df, palette=colors, hue=hue, ax=ax)
    else:
        ax = sns.countplot(x=feature, data=df, palette=colors, ax=ax)
        
    format_spines(ax)

    # Setting percentage
    for p in ax.patches:
        x=p.get_bbox().get_points()[:,0]
        y=p.get_bbox().get_points()[1,1]
        ax.annotate('{:.1f}%'.format(100.*y/ncount), (x.mean(), y), 
                ha='center', va='bottom') # set the alignment of the text
    
    # Final configuration
    if not hue:
        ax.set_title(df[feature].describe().name + ' Analysis', size=13, pad=15)
    else:
        ax.set_title(df[feature].describe().name + ' Analysis by ' + hue, size=13, pad=15)  
    if title != '':
        ax.set_title(title)       
    plt.tight_layout()
    
    
def bar_plot(x, y, df, colors='Blues_d', hue=False, ax=None, value=False, title=''):
    
    # Preparing variables
    try:
        ncount = sum(df[y])
    except:
        ncount = sum(df[x])
    #fig, ax = plt.subplots()
    if hue != False:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, hue=hue, ax=ax, ci=None)
    else:
        ax = sns.barplot(x=x, y=y, data=df, palette=colors, ax=ax, ci=None)

    # Setting borders
    format_spines(ax)

    # Setting percentage
    for p in ax.patches:
        xp=p.get_bbox().get_points()[:,0]
        yp=p.get_bbox().get_points()[1,1]
        if value:
            ax.annotate('{:.2f}k'.format(yp/1000), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
        else:
            ax.annotate('{:.1f}%'.format(100.*yp/ncount), (xp.mean(), yp), 
                    ha='center', va='bottom') # set the alignment of the text
    if not hue:
        ax.set_title(df[x].describe().name + ' Analysis', size=12, pad=15)
    else:
        ax.set_title(df[x].describe().name + ' Analysis by ' + hue, size=12, pad=15)
    if title != '':
        ax.set_title(title)  
    plt.tight_layout()

Feature Engineering

In [100]:
# loading data 
customers_ = pd.read_csv("olist_customers_dataset.csv")
order_items_ = pd.read_csv("olist_order_items_dataset.csv")
order_payments_ = pd.read_csv("olist_order_payments_dataset.csv")
orders_ = pd.read_csv("olist_orders_dataset.csv")
In [101]:
# displaying data shape
#dataset = [customers, geolocation, order_items, order_payments, order_reviews, orders, products, sellers, category_name_translation]
dataset = {
    'Customers': customers_,
    'Order Items': order_items_,
    'Payments': order_payments_,
    'Orders': orders_
}

for x, y in dataset.items():
    print(f'{x}', (list(y.shape)))
Customers [99441, 5]
Order Items [112650, 7]
Payments [103886, 5]
Orders [99441, 8]
In [102]:
# displaying dataset column names
for x, y in dataset.items():
    print(f'{x}', f'{list(y.columns)}\n')
Customers ['customer_id', 'customer_unique_id', 'customer_zip_code_prefix', 'customer_city', 'customer_state']

Order Items ['order_id', 'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date', 'price', 'freight_value']

Payments ['order_id', 'payment_sequential', 'payment_type', 'payment_installments', 'payment_value']

Orders ['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

In [103]:
# checking for null values in datasets
for x, y in dataset.items():
    print(f'{x}: {y.isnull().any().any()}')
Customers: False
Order Items: False
Payments: False
Orders: True
In [104]:
# taking count for dataset with missing values
for x, y in dataset.items():
    if y.isnull().any().any():
        print(f'{x}', (list(y.shape)),'\n')
        print(f'{y.isnull().sum()}\n')
Orders [99441, 8] 

order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                 160
order_delivered_carrier_date     1783
order_delivered_customer_date    2965
order_estimated_delivery_date       0
dtype: int64

In [105]:
# creating master dataframe 
order_payments_.head()
df1 = order_payments_.merge(order_items_, on='order_id')
df2 = df1.merge(orders_, on='order_id')
df = df2.merge(customers_, on='customer_id')
print(df.shape)
(117601, 22)
In [106]:
# converting date columns to datetime
date_columns = ['shipping_limit_date', 'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
In [107]:
df['customer_city'] = df['customer_city'].str.title()
df['payment_type'] = df['payment_type'].str.replace('_', ' ').str.title()
# engineering new/essential columns
df['delivery_against_estimated'] = (df['order_estimated_delivery_date'] - df['order_delivered_customer_date']).dt.days
df['order_purchase_year'] = df.order_purchase_timestamp.apply(lambda x: x.year)
df['order_purchase_month'] = df.order_purchase_timestamp.apply(lambda x: x.month)
df['order_purchase_dayofweek'] = df.order_purchase_timestamp.apply(lambda x: x.dayofweek)
df['order_purchase_hour'] = df.order_purchase_timestamp.apply(lambda x: x.hour)
df['order_purchase_day'] = df['order_purchase_dayofweek'].map({0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'})
df['order_purchase_mon'] = df.order_purchase_timestamp.apply(lambda x: x.month).map({1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'Jun',7:'Jul',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'})
# Changing the month attribute for correct ordenation
df['month_year'] = df['order_purchase_month'].astype(str).apply(lambda x: '0' + x if len(x) == 1 else x)
df['month_year'] = df['order_purchase_year'].astype(str) + '-' + df['month_year'].astype(str)
#creating year month column
df['month_y'] = df['order_purchase_timestamp'].map(lambda date: 100*date.year + date.month)
# displaying summary staticstics of columns
In [108]:
# displaying summary staticstics of columns
df.describe(include='all')
Out[108]:
order_id payment_sequential payment_type payment_installments payment_value order_item_id product_id seller_id shipping_limit_date price freight_value customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date customer_unique_id customer_zip_code_prefix customer_city customer_state delivery_against_estimated order_purchase_year order_purchase_month order_purchase_dayofweek order_purchase_hour order_purchase_day order_purchase_mon month_year month_y
count 117601 117601.000000 117601 117601.000000 117601.000000 117601.000000 117601 117601 117601 117601.000000 117601.000000 117601 117601 117601 117586 116356 115034 117601 117601 117601.000000 117601 117601 115034.000000 117601.000000 117601.000000 117601.000000 117601.000000 117601 117601 117601 117601.000000
unique 98665 NaN 4 NaN NaN NaN 32951 3095 93317 NaN NaN 98665 7 98111 90173 81016 95663 449 95419 NaN 4110 27 NaN NaN NaN NaN NaN 7 12 24 NaN
top 895ab968e7bb0d5659d16cd74cd1650c NaN Credit Card NaN NaN NaN aca2eb7d00ea1a7b8ebd4e68314663af 4a3ca9315b744ce9f8e9374361493884 2017-08-14 20:43:31 NaN NaN 270c23a11d024a44c896d1894b261a83 delivered 2017-08-08 20:26:31 2017-08-08 20:43:31 2017-08-10 11:58:14 2017-08-14 12:46:18 2017-12-20 00:00:00 9a736b248f67d166d2fbb006bcb877c3 NaN Sao Paulo SP NaN NaN NaN NaN NaN Mon Aug 2017-11 NaN
freq 63 NaN 86769 NaN NaN NaN 536 2133 63 NaN NaN 63 115035 63 63 63 63 649 75 NaN 18590 49566 NaN NaN NaN NaN NaN 19130 12632 9016 NaN
first NaN NaN NaN NaN NaN NaN NaN NaN 2016-09-19 00:15:34 NaN NaN NaN NaN 2016-09-04 21:15:19 2016-10-04 09:43:32 2016-10-08 10:34:01 2016-10-11 13:46:32 2016-10-20 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
last NaN NaN NaN NaN NaN NaN NaN NaN 2020-04-09 22:35:08 NaN NaN NaN NaN 2018-09-03 09:06:57 2018-09-03 17:40:06 2018-09-11 19:48:28 2018-10-17 13:22:46 2018-10-25 00:00:00 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
mean NaN 1.093528 NaN 2.939482 172.686752 1.195900 NaN NaN NaN 120.824783 20.045990 NaN NaN NaN NaN NaN NaN NaN NaN 35051.793097 NaN NaN 11.043326 2017.538193 6.028129 2.745750 14.760002 NaN NaN NaN 201759.847399
std NaN 0.726692 NaN 2.774223 267.592290 0.697706 NaN NaN NaN 184.479323 15.861315 NaN NaN NaN NaN NaN NaN NaN NaN 29820.588877 NaN NaN 10.162307 0.505065 3.229579 1.961257 5.325670 NaN NaN NaN 48.798820
min NaN 1.000000 NaN 0.000000 0.000000 1.000000 NaN NaN NaN 0.850000 0.000000 NaN NaN NaN NaN NaN NaN NaN NaN 1003.000000 NaN NaN -189.000000 2016.000000 1.000000 0.000000 0.000000 NaN NaN NaN 201609.000000
25% NaN 1.000000 NaN 1.000000 60.870000 1.000000 NaN NaN NaN 39.900000 13.080000 NaN NaN NaN NaN NaN NaN NaN NaN 11310.000000 NaN NaN 6.000000 2017.000000 3.000000 1.000000 11.000000 NaN NaN NaN 201709.000000
50% NaN 1.000000 NaN 2.000000 108.210000 1.000000 NaN NaN NaN 74.900000 16.290000 NaN NaN NaN NaN NaN NaN NaN NaN 24315.000000 NaN NaN 12.000000 2018.000000 6.000000 3.000000 15.000000 NaN NaN NaN 201801.000000
75% NaN 1.000000 NaN 4.000000 189.260000 1.000000 NaN NaN NaN 134.900000 21.190000 NaN NaN NaN NaN NaN NaN NaN NaN 58600.000000 NaN NaN 16.000000 2018.000000 8.000000 4.000000 19.000000 NaN NaN NaN 201805.000000
max NaN 29.000000 NaN 24.000000 13664.080000 21.000000 NaN NaN NaN 6735.000000 409.680000 NaN NaN NaN NaN NaN NaN NaN NaN 99990.000000 NaN NaN 146.000000 2018.000000 12.000000 6.000000 23.000000 NaN NaN NaN 201809.000000
In [109]:
# displaying missing value counts and corresponding percentage against total observations
missing_values = df.isnull().sum().sort_values(ascending = False)
percentage = (df.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
pd.concat([missing_values, percentage], axis=1, keys=['Values', 'Percentage']).transpose()
Out[109]:
delivery_against_estimated order_delivered_customer_date order_delivered_carrier_date order_approved_at payment_type payment_installments payment_value order_item_id product_id order_purchase_timestamp seller_id shipping_limit_date price freight_value customer_id payment_sequential order_status month_y month_year order_estimated_delivery_date customer_unique_id customer_zip_code_prefix customer_city customer_state order_purchase_year order_purchase_month order_purchase_dayofweek order_purchase_hour order_purchase_day order_purchase_mon order_id
Values 2567.000000 2567.000000 1245.000000 15.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
Percentage 2.182805 2.182805 1.058664 0.012755 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
In [110]:
# dropping missing values
df.dropna(inplace=True)
df.isnull().values.any()
Out[110]:
False
In [111]:
# displaying dataframe info
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 115018 entries, 0 to 117600
Data columns (total 31 columns):
order_id                         115018 non-null object
payment_sequential               115018 non-null int64
payment_type                     115018 non-null object
payment_installments             115018 non-null int64
payment_value                    115018 non-null float64
order_item_id                    115018 non-null int64
product_id                       115018 non-null object
seller_id                        115018 non-null object
shipping_limit_date              115018 non-null datetime64[ns]
price                            115018 non-null float64
freight_value                    115018 non-null float64
customer_id                      115018 non-null object
order_status                     115018 non-null object
order_purchase_timestamp         115018 non-null datetime64[ns]
order_approved_at                115018 non-null datetime64[ns]
order_delivered_carrier_date     115018 non-null datetime64[ns]
order_delivered_customer_date    115018 non-null datetime64[ns]
order_estimated_delivery_date    115018 non-null datetime64[ns]
customer_unique_id               115018 non-null object
customer_zip_code_prefix         115018 non-null int64
customer_city                    115018 non-null object
customer_state                   115018 non-null object
delivery_against_estimated       115018 non-null float64
order_purchase_year              115018 non-null int64
order_purchase_month             115018 non-null int64
order_purchase_dayofweek         115018 non-null int64
order_purchase_hour              115018 non-null int64
order_purchase_day               115018 non-null object
order_purchase_mon               115018 non-null object
month_year                       115018 non-null object
month_y                          115018 non-null int64
dtypes: datetime64[ns](6), float64(4), int64(9), object(12)
memory usage: 28.1+ MB
In [112]:
# excluding incomplete 2012 data and displaying first 3 rows of master dataframe
df = df.query("month_year != '2016-12' and month_year != '2016-10'")
df.head(3)
Out[112]:
order_id payment_sequential payment_type payment_installments payment_value order_item_id product_id seller_id shipping_limit_date price freight_value customer_id order_status order_purchase_timestamp order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date customer_unique_id customer_zip_code_prefix customer_city customer_state delivery_against_estimated order_purchase_year order_purchase_month order_purchase_dayofweek order_purchase_hour order_purchase_day order_purchase_mon month_year month_y
0 b81ef226f3fe1789b1e8b2acac839d17 1 Credit Card 8 99.33 1 af74cc53dcffc8384b29e7abfa41902b 213b25e6f54661939f11710a6fddb871 2018-05-02 22:15:09 79.80 19.53 0a8556ac6be836b46b3e89920d59291c delivered 2018-04-25 22:01:49 2018-04-25 22:15:09 2018-05-02 15:20:00 2018-05-09 17:36:51 2018-05-22 708ab75d2a007f0564aedd11139c7708 39801 Teofilo Otoni MG 12.0 2018 4 2 22 Wed Apr 2018-04 201804
1 a9810da82917af2d9aefd1278f1dcfa0 1 Credit Card 1 24.39 1 a630cc320a8c872f9de830cf121661a3 eaf6d55068dea77334e8477d3878d89e 2018-07-02 11:18:58 17.00 7.39 f2c7fc58a9de810828715166c672f10a delivered 2018-06-26 11:01:38 2018-06-26 11:18:58 2018-06-28 14:18:00 2018-06-29 20:32:09 2018-07-16 a8b9d3a27068454b1c98cc67d4e31e6f 2422 Sao Paulo SP 16.0 2018 6 1 11 Tue Jun 2018-06 201806
2 25e8ea4e93396b6fa0d3dd708e76c1bd 1 Credit Card 1 65.71 1 2028bf1b01cafb2d2b1901fca4083222 cc419e0650a3c5ba77189a1882b7556a 2017-12-26 09:52:34 56.99 8.72 25b14b69de0b6e184ae6fe2755e478f9 delivered 2017-12-12 11:19:55 2017-12-14 09:52:34 2017-12-15 20:13:22 2017-12-18 17:24:41 2018-01-04 6f70c0b2f7552832ba46eb57b1c5651e 2652 Sao Paulo SP 16.0 2017 12 1 11 Tue Dec 2017-12 201712

Monthly Revenue

In [113]:
#calculate Revenue for each row and create a new dataframe with YearMonth - Revenue columns
df_revenue = df.groupby(['month_year'])['payment_value'].sum().reset_index()
df_revenue
Out[113]:
month_year payment_value
0 2017-01 176376.56
1 2017-02 323815.95
2 2017-03 505735.83
3 2017-04 456108.32
4 2017-05 701119.60
5 2017-06 585400.98
6 2017-07 716069.98
7 2017-08 842689.94
8 2017-09 996085.61
9 2017-10 998609.62
10 2017-11 1548547.86
11 2017-12 1020067.26
12 2018-01 1374064.02
13 2018-02 1280014.54
14 2018-03 1435458.33
15 2018-04 1466607.15
16 2018-05 1480667.59
17 2018-06 1285396.78
18 2018-07 1306707.42
19 2018-08 1211240.09

Monthly Revenue Growth Rate

In [114]:
#calculating for monthly revenie growth rate
# using pct_change() function to see monthly percentage change
df_revenue['MonthlyGrowth'] = df_revenue['payment_value'].pct_change()

df_revenue
Out[114]:
month_year payment_value MonthlyGrowth
0 2017-01 176376.56 NaN
1 2017-02 323815.95 0.835935
2 2017-03 505735.83 0.561800
3 2017-04 456108.32 -0.098129
4 2017-05 701119.60 0.537178
5 2017-06 585400.98 -0.165048
6 2017-07 716069.98 0.223213
7 2017-08 842689.94 0.176826
8 2017-09 996085.61 0.182031
9 2017-10 998609.62 0.002534
10 2017-11 1548547.86 0.550704
11 2017-12 1020067.26 -0.341275
12 2018-01 1374064.02 0.347033
13 2018-02 1280014.54 -0.068446
14 2018-03 1435458.33 0.121439
15 2018-04 1466607.15 0.021700
16 2018-05 1480667.59 0.009587
17 2018-06 1285396.78 -0.131880
18 2018-07 1306707.42 0.016579
19 2018-08 1211240.09 -0.073059

Monthly Active Customers

In [115]:
#creating monthly active customers dataframe by counting unique Customer IDs
df_monthly_active = df.groupby('month_year')['customer_unique_id'].nunique().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='customer_unique_id', df=df_monthly_active, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Monthly Order Count

In [116]:
#creating monthly active customers dataframe by counting unique Customer IDs
df_monthly_sales = df.groupby('month_year')['order_status'].count().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='order_status', df=df_monthly_sales, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Average Revenue per Customer Purchase

In [117]:
# create a new dataframe for average revenue by taking the mean of it
df_monthly_order_avg = df.groupby('month_year')['payment_value'].mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='payment_value', df=df_monthly_order_avg, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

New Customer Ratio

In [118]:
#create a dataframe contaning CustomerID and first purchase date
df_min_purchase = df.groupby('customer_unique_id').order_purchase_timestamp.min().reset_index()
df_min_purchase.columns = ['customer_unique_id','minpurchasedate']
df_min_purchase['minpurchasedate'] = df_min_purchase['minpurchasedate'].map(lambda date: 100*date.year + date.month)

#merge first purchase date column to our main dataframe (tx_uk)
df = pd.merge(df, df_min_purchase, on='customer_unique_id')
In [119]:
#create a column called User Type and assign Existing 
#if User's First Purchase Year Month before the selected Invoice Year Month
df['usertype'] = 'New'
df.loc[df['month_y']>df['minpurchasedate'],'usertype'] = 'Existing'

#calculate the Revenue per month for each user type
df_user_type_revenue = df.groupby(['month_y','usertype', 'month_year'])['payment_value'].sum().reset_index()

df_user_type_revenue
Out[119]:
month_y usertype month_year payment_value
0 201701 New 2017-01 176376.56
1 201702 Existing 2017-02 111.07
2 201702 New 2017-02 323704.88
3 201703 Existing 2017-03 596.38
4 201703 New 2017-03 505139.45
5 201704 Existing 2017-04 2677.76
6 201704 New 2017-04 453430.56
7 201705 Existing 2017-05 6733.95
8 201705 New 2017-05 694385.65
9 201706 Existing 2017-06 6956.06
10 201706 New 2017-06 578444.92
11 201707 Existing 2017-07 13276.36
12 201707 New 2017-07 702793.62
13 201708 Existing 2017-08 15000.05
14 201708 New 2017-08 827689.89
15 201709 Existing 2017-09 14011.16
16 201709 New 2017-09 982074.45
17 201710 Existing 2017-10 20695.65
18 201710 New 2017-10 977913.97
19 201711 Existing 2017-11 24770.55
20 201711 New 2017-11 1523777.31
21 201712 Existing 2017-12 24133.48
22 201712 New 2017-12 995933.78
23 201801 Existing 2018-01 24698.90
24 201801 New 2018-01 1349365.12
25 201802 Existing 2018-02 26661.62
26 201802 New 2018-02 1253352.92
27 201803 Existing 2018-03 34568.26
28 201803 New 2018-03 1400890.07
29 201804 Existing 2018-04 41982.07
30 201804 New 2018-04 1424625.08
31 201805 Existing 2018-05 35888.25
32 201805 New 2018-05 1444779.34
33 201806 Existing 2018-06 40794.44
34 201806 New 2018-06 1244602.34
35 201807 Existing 2018-07 33086.53
36 201807 New 2018-07 1273620.89
37 201808 Existing 2018-08 27898.55
38 201808 New 2018-08 1183341.54
In [120]:
fig, ax = plt.subplots(figsize=(15, 6))
sns.set(palette='muted', color_codes=True)
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'New'"), label='New')
ax = sns.lineplot(x='month_year', y='payment_value', data=df_user_type_revenue.query("usertype == 'Existing'"), label='Existing')
format_spines(ax, right_border=False)
ax.set_title('Existing vs New Customer Comparison')
ax.tick_params(axis='x', labelrotation=90)
plt.show()
In [121]:
#create a dataframe that shows new user ratio - we also need to drop NA values (first month new user ratio is 0)
df_user_ratio = df.query("usertype == 'New'").groupby(['month_year'])['customer_unique_id'].nunique()/df.query("usertype == 'Existing'").groupby(['month_year'])['customer_unique_id'].nunique() 
df_user_ratio = df_user_ratio.reset_index()

#dropping nan values that resulted from first and last month
df_user_ratio = df_user_ratio.dropna()
df_user_ratio.columns = ['month_year','NewCusRatio']

#print the dafaframe
df_user_ratio
Out[121]:
month_year NewCusRatio
1 2017-02 808.000000
2 2017-03 500.600000
3 2017-04 132.764706
4 2017-05 123.214286
5 2017-06 77.871795
6 2017-07 76.591837
7 2017-08 71.175439
8 2017-09 51.333333
9 2017-10 49.193182
10 2017-11 57.868852
11 2017-12 47.660714
12 2018-01 52.236641
13 2018-02 56.151786
14 2018-03 48.741007
15 2018-04 40.629630
16 2018-05 35.178378
17 2018-06 32.285714
18 2018-07 39.377483
19 2018-08 37.012048
In [122]:
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_year', y='NewCusRatio', df=df_user_ratio, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Monthly Retention Rate

In [123]:
#Monthly Retention Rate = Retained Customers From Prev. Month/Active Customers Total (using crosstab)

#identifying active users are active by looking at their revenue per month
df_user_purchase = df.groupby(['customer_unique_id','month_y'])['payment_value'].sum().reset_index()
df_user_purchase.head()
Out[123]:
customer_unique_id month_y payment_value
0 0000366f3b9a7992bf8c76cfdf3221e2 201805 141.90
1 0000b849f77a49e4a4ce2b2a4ca5be3f 201805 27.19
2 0000f46a3911fa3c0805444483337064 201703 86.22
3 0000f6ccb0745a6a4b88665a16c9f078 201710 43.62
4 0004aac84e0df4da2b147fca70cf8255 201711 196.89
In [124]:
#identifying active users are active by looking at their order count per month
df_user_purchase = df.groupby(['customer_unique_id','month_y'])['payment_value'].count().reset_index()
df_user_purchase.head()
Out[124]:
customer_unique_id month_y payment_value
0 0000366f3b9a7992bf8c76cfdf3221e2 201805 1
1 0000b849f77a49e4a4ce2b2a4ca5be3f 201805 1
2 0000f46a3911fa3c0805444483337064 201703 1
3 0000f6ccb0745a6a4b88665a16c9f078 201710 1
4 0004aac84e0df4da2b147fca70cf8255 201711 1
In [125]:
#create retention matrix with crosstab using purchase
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
df_retention.head()
Out[125]:
month_y customer_unique_id 201701 201702 201703 201704 201705 201706 201707 201708 201709 201710 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
0 0000366f3b9a7992bf8c76cfdf3221e2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
1 0000b849f77a49e4a4ce2b2a4ca5be3f 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
2 0000f46a3911fa3c0805444483337064 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 0000f6ccb0745a6a4b88665a16c9f078 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
4 0004aac84e0df4da2b147fca70cf8255 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
In [126]:
#creating an array of dictionary which keeps Retained & Total User count for each month
months = df_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
    retention_data = {}
    selected_month = months[i+1]
    prev_month = months[i]
    retention_data['month_y'] = int(selected_month)
    retention_data['TotalUserCount'] = df_retention[selected_month].sum()
    retention_data['RetainedUserCount'] = df_retention[(df_retention[selected_month]>0) & (df_retention[prev_month]>0)][selected_month].sum()
    retention_array.append(retention_data)
    
#convert the array to dataframe and calculate Retention Rate
df_retention = pd.DataFrame(retention_array)
df_retention['RetentionRate'] = df_retention['RetainedUserCount']/df_retention['TotalUserCount']

df_retention
Out[126]:
RetainedUserCount TotalUserCount month_y RetentionRate
0 3 2508 201703 0.001196
1 11 2274 201704 0.004837
2 14 3478 201705 0.004025
3 16 3076 201706 0.005202
4 16 3802 201707 0.004208
5 23 4114 201708 0.005591
6 32 4082 201709 0.007839
7 32 4417 201710 0.007245
8 37 7182 201711 0.005152
9 41 5450 201712 0.007523
10 16 6974 201801 0.002294
11 27 6401 201802 0.004218
12 23 6914 201803 0.003327
13 31 6744 201804 0.004597
14 45 6693 201805 0.006723
15 38 6058 201806 0.006273
16 26 6097 201807 0.004264
17 37 6310 201808 0.005864
In [127]:
fig, ax = plt.subplots(figsize=(12, 6))
sns.set(palette='muted', color_codes=True, style='whitegrid')
bar_plot(x='month_y', y='RetentionRate', df=df_retention, value=True)
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Cohort Based Retention Rate

Cohorts are determined as first purchase year-month of the customers. I will be measuring what percentage of the customers retained after their first purchase in each month. This view will help unveil how recent and old cohorts differ regarding retention rate and if recent changes in customer experience affected new customer’s retention or not.

In [128]:
#create our retention table again with crosstab() - we need to change the column names for using them in .query() function
df_retention = pd.crosstab(df_user_purchase['customer_unique_id'], df_user_purchase['month_y']).reset_index()
new_column_names = [ 'm_' + str(column) for column in df_retention.columns]
df_retention.columns = new_column_names
In [129]:
#create the array of Retained users for each cohort monthly
retention_array = []
for i in range(len(months)):
    retention_data = {}
    selected_month = months[i]
    prev_months = months[:i]
    next_months = months[i+1:]
    for prev_month in prev_months:
        retention_data[prev_month] = np.nan
        
    total_user_count =  retention_data['TotalUserCount'] = df_retention['m_' + str(selected_month)].sum()
    retention_data[selected_month] = 1 
    
    query = "{} > 0".format('m_' + str(selected_month))
    

    for next_month in next_months:
        query = query + " and {} > 0".format(str('m_' + str(next_month)))
        retention_data[next_month] = np.round(df_retention.query(query)['m_' + str(next_month)].sum()/total_user_count,2)
    retention_array.append(retention_data)
In [130]:
#create the array of Retained users for each cohort monthly
retention_array = []
for i in range(len(months)):
    retention_data = {}
    selected_month = months[i]
    prev_months = months[:i]
    next_months = months[i+1:]
    for prev_month in prev_months:
        retention_data[prev_month] = np.nan
        
    total_user_count =  retention_data['TotalUserCount'] = df_retention['m_' + str(selected_month)].sum()
    retention_data[selected_month] = 1 
    
    query = "{} > 0".format('m_' + str(selected_month))
    

    for next_month in next_months:
        query = query + " and {} > 0".format(str('m_' + str(next_month)))
        retention_data[next_month] = np.round(df_retention.query(query)['m_' + str(next_month)].sum()/total_user_count,2)
    retention_array.append(retention_data)
    
df_retention = pd.DataFrame(retention_array)
df_retention.index = months

#showing new cohort based retention table
df_retention
Out[130]:
TotalUserCount 201702 201703 201704 201705 201706 201707 201708 201709 201710 201711 201712 201801 201802 201803 201804 201805 201806 201807 201808
month_y
201702 1618 1.0 0.0 0.0 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201703 2508 NaN 1.0 0.0 0.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201704 2274 NaN NaN 1.0 0.01 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201705 3478 NaN NaN NaN 1.00 0.0 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201706 3076 NaN NaN NaN NaN 1.0 0.01 0.00 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201707 3802 NaN NaN NaN NaN NaN 1.00 0.01 0.00 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201708 4114 NaN NaN NaN NaN NaN NaN 1.00 0.01 0.00 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201709 4082 NaN NaN NaN NaN NaN NaN NaN 1.00 0.01 0.00 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201710 4417 NaN NaN NaN NaN NaN NaN NaN NaN 1.00 0.01 0.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201711 7182 NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.00 0.01 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201712 5450 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.00 0.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201801 6974 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.0 0.0 0.0 0.00 0.00 0.0 0.00
201802 6401 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.0 0.0 0.00 0.00 0.0 0.00
201803 6914 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.0 0.00 0.00 0.0 0.00
201804 6744 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.01 0.00 0.0 0.00
201805 6693 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.00 0.01 0.0 0.00
201806 6058 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.00 0.0 0.00
201807 6097 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 0.01
201808 6310 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.00

Customer Segmentation

Customers who shop on Olist have different needs and they have their own different profile. We should adapt our actions depending on that.

RFM stands for Recency - Frequency - Monetary Value, I will be using this metrics to segment customers. Theoretically we will have segments like below: Low Value: Customers who are less active than others, not very frequent buyer/visitor and generates very low - zero - maybe negative revenue. Mid Value: In the middle of everything. Often using our platform (but not as much as our High Values), fairly frequent and generates moderate revenue. High Value: The group we don’t want to lose. High Revenue, Frequency and low Inactivity.

Recency

To calculate recency, we need to find out most recent purchase date of each customer and see for how many days they are inactive. After having no. of inactive days for each customer, we will apply K-means* clustering to assign customers a recency score.

In [131]:
#creates a generic user dataframe to keep CustomerID and new segmentation scores
df_user = pd.DataFrame(df['customer_unique_id'])
df_user.columns = ['customer_unique_id']

#gets the max purchase date for each customer and create a dataframe with it
df_max_purchase = df.groupby('customer_unique_id').order_purchase_timestamp.max().reset_index()
df_max_purchase.columns = ['customer_unique_id', 'MaxPurchaseDate']

#we take our observation point as the max purchase date in our dataset
df_max_purchase['Recency'] = (df_max_purchase['MaxPurchaseDate'].max() - df_max_purchase['MaxPurchaseDate']).dt.days

#merge this dataframe to our new user dataframe
df_user = pd.merge(df_user, df_max_purchase[['customer_unique_id','Recency']], on='customer_unique_id')

df_user.head()
Out[131]:
customer_unique_id Recency
0 708ab75d2a007f0564aedd11139c7708 125
1 a8b9d3a27068454b1c98cc67d4e31e6f 64
2 6f70c0b2f7552832ba46eb57b1c5651e 260
3 87695ed086ebd36f20404c82d20fca87 266
4 4291db0da71914754618cd789aebcd56 28
In [132]:
# getting summary statistics of the recency table
df_user.Recency.describe()
Out[132]:
count    114681.000000
mean        235.937671
std         150.795167
min           0.000000
25%         113.000000
50%         217.000000
75%         344.000000
max         601.000000
Name: Recency, dtype: float64
In [133]:
# plotting the distribution of the continous feature set
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['Recency'], bins=30)
plt.show()

Applying K-means clustering to assign a recency score to each customer. But there is need to tell how many clusters i need to K-means algorithm. To find this out, I will apply Elbow Method. Elbow Method simply tells the optimal cluster number for optimal inertia.

In [134]:
 from sklearn.cluster import KMeans

sse={}
df_recency = df_user[['Recency']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_recency)
    df_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
    
plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
In [135]:
#building 5 clusters for recency and adding it to dataframe
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Recency']])
df_user['RecencyCluster'] = kmeans.predict(df_user[['Recency']])

#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

df_user = order_cluster('RecencyCluster', 'Recency',df_user,False)
In [136]:
#displaying the details of each cluster
df_user.groupby('RecencyCluster')['Recency'].describe()
Out[136]:
count mean std min 25% 50% 75% max
RecencyCluster
0 14624.0 503.084587 42.798837 439.0 466.0 496.0 537.0 601.0
1 18609.0 373.900425 34.084795 317.0 344.0 373.0 403.0 438.0
2 27424.0 258.589155 31.146748 205.0 230.0 264.0 280.0 316.0
3 29978.0 150.630396 30.874496 99.0 123.0 150.0 178.0 204.0
4 24046.0 47.217791 27.082977 0.0 23.0 43.0 71.0 98.0

Frequency

To create frequency clusters, i will need to find total number orders for each customer, after which i cango ahead and place them in various clusters

In [137]:
#get order counts for each user and create a dataframe with it
df_frequency = df.groupby('customer_unique_id').order_purchase_timestamp.count().reset_index()
df_frequency.columns = ['customer_unique_id','Frequency']

#add this data to our main dataframe
df_user = pd.merge(df_user, df_frequency, on='customer_unique_id')
In [138]:
# getting summary statistics of the recency table
df_user.Frequency.describe()
Out[138]:
count    114681.000000
mean          1.777496
std           2.774387
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          75.000000
Name: Frequency, dtype: float64
In [139]:
# plotting the distribution of the continous feature set
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['Frequency'], hist=False)
plt.show()
In [140]:
#k-means
kmeans = KMeans(n_clusters=5)
kmeans.fit(df_user[['Frequency']])
df_user['FrequencyCluster'] = kmeans.predict(df_user[['Frequency']])

#order the frequency cluster
df_user = order_cluster('FrequencyCluster', 'Frequency',df_user,True)

#see details of each cluster
df_user.groupby('FrequencyCluster')['Frequency'].describe()
Out[140]:
count mean std min 25% 50% 75% max
FrequencyCluster
0 99875.0 1.207900 0.405807 1.0 1.0 1.0 1.0 2.0
1 12870.0 4.039005 1.186489 3.0 3.0 4.0 5.0 7.0
2 1464.0 10.831967 2.252463 8.0 9.0 11.0 12.0 16.0
3 397.0 24.536524 6.099081 18.0 20.0 22.0 24.0 38.0
4 75.0 75.000000 0.000000 75.0 75.0 75.0 75.0 75.0

As the same notation as recency clusters, high frequency number indicates better customers.

Revenue

Let’s see how our customer base looks like when I cluster them based on revenue. I will calculate revenue for each customer, plot a histogram and apply the same clustering method.

In [141]:
#calculate revenue for each customer
df_revenue = df.groupby('customer_unique_id').payment_value.sum().reset_index()

#merge it with our main dataframe
df_user = pd.merge(df_user, df_revenue, on='customer_unique_id')
In [142]:
# plotting the distribution of the continous feature set
sns.set(palette='muted', color_codes=True, style='white')
fig, ax = plt.subplots(figsize=(12, 6))
sns.despine(left=True)
sns.distplot(df_user['payment_value'], hist=False)
plt.show()
In [143]:
sse={}
df_revenue = df_user[['payment_value']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(df_revenue)
    df_revenue["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_
    
plt.figure(figsize=(10, 5))
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show()
In [144]:
#apply clustering
kmeans = KMeans(n_clusters=6)
kmeans.fit(df_user[['payment_value']])
df_user['RevenueCluster'] = kmeans.predict(df_user[['payment_value']])


#order the cluster numbers
df_user = order_cluster('RevenueCluster', 'payment_value',df_user,True)

#show details of the dataframe
df_user.groupby('RevenueCluster')['payment_value'].describe()
Out[144]:
count mean std min 25% 50% 75% max
RevenueCluster
0 108158.0 192.457586 190.876380 9.59 67.53 124.205 233.3375 1031.15
1 5542.0 1878.489928 816.514574 1031.69 1252.08 1602.250 2216.8350 4415.96
2 743.0 7000.283419 2137.860492 4447.80 5289.12 6317.220 7971.8800 12490.88
3 184.0 20278.110435 5207.417793 14196.28 16313.60 19174.380 25051.8900 30186.00
4 46.0 43587.292174 2837.177072 36489.24 44048.00 44048.000 45256.0000 45256.00
5 8.0 109312.640000 0.000000 109312.64 109312.64 109312.640 109312.6400 109312.64

Overall Score

After creating various metric scores (cluster numbers) for recency, frequency & revenue. now i will proceed to create an overall score out of them:

In [145]:
#renaming columns
df_user.columns = ['customer_unique_id', 'Recency', 'RecencyCluster', 'Frequency', 'FrequencyCluster', 'Monetary', 'RevenueCluster']
#calculate overall score and use mean() to see details
df_user['OverallScore'] = df_user['RecencyCluster'] + df_user['FrequencyCluster'] + df_user['RevenueCluster']
df_user.groupby('OverallScore')['Recency','Frequency','Monetary'].mean()
Out[145]:
Recency Frequency Monetary
OverallScore
0 502.486153 1.207470 166.043273
1 384.083767 1.376805 199.495597
2 270.960344 1.468850 221.659299
3 167.369310 1.584444 255.073696
4 68.998195 1.703787 310.956322
5 103.572973 6.029730 1236.708176
6 90.805680 6.739910 3053.829791
7 114.172260 11.863535 6191.532438
8 156.974843 12.257862 24128.473333
10 96.454545 22.181818 35259.341818
In [146]:
df_user['Segment'] = 'Low-Value'
df_user.loc[df_user['OverallScore']>3,'Segment'] = 'Mid-Value' 
df_user.loc[df_user['OverallScore']>6,'Segment'] = 'High-Value' 
In [147]:
df_user.head()
Out[147]:
customer_unique_id Recency RecencyCluster Frequency FrequencyCluster Monetary RevenueCluster OverallScore Segment
0 708ab75d2a007f0564aedd11139c7708 125 3 1 0 99.33 0 3 Low-Value
1 6e3c218d5f0434ddc4af3d6a60767bbf 114 3 2 0 192.24 0 3 Low-Value
2 6e3c218d5f0434ddc4af3d6a60767bbf 114 3 2 0 192.24 0 3 Low-Value
3 0ae522661311f598df20272643d39ce6 160 3 1 0 157.45 0 3 Low-Value
4 d386a136dc889cf681443061874caad8 113 3 1 0 136.71 0 3 Low-Value
In [148]:
# plotting the distribution of the continous feature set
sns.set(palette='muted', color_codes=True, style='whitegrid')
fig, axs = plt.subplots(1, 3, figsize=(22, 5))
sns.despine(left=True)
sns.scatterplot(x='Recency', y='Frequency', ax=axs[0], hue='Segment', data=df_user, size='Segment', sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Frequency', y='Monetary', ax=axs[1], hue='Segment', data=df_user, size='Segment' , sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
sns.scatterplot(x='Recency', y='Monetary', ax=axs[2], hue='Segment', data=df_user, size='Segment' , sizes=(50,150), size_order=['High-Value','Mid-Value','Low-Value'])
axs[0].set_title('Customer Segments by Recency & Frequency')
axs[1].set_title('Customer Segments by Frequency & Monetary')
axs[2].set_title('Customer Segments by Recency & Monetary')
plt.show()
In [2]:
pip install rpy2
Requirement already satisfied: rpy2 in d:\anaconda3\lib\site-packages (3.3.6)
Requirement already satisfied: tzlocal in d:\anaconda3\lib\site-packages (from rpy2) (2.1)
Requirement already satisfied: cffi>=1.10.0 in d:\anaconda3\lib\site-packages (from rpy2) (1.12.3)
Requirement already satisfied: pytest in d:\anaconda3\lib\site-packages (from rpy2) (5.0.1)
Requirement already satisfied: pytz in d:\anaconda3\lib\site-packages (from rpy2) (2019.1)
Requirement already satisfied: jinja2 in d:\anaconda3\lib\site-packages (from rpy2) (2.10.1)
Requirement already satisfied: pycparser in d:\anaconda3\lib\site-packages (from cffi>=1.10.0->rpy2) (2.19)
Requirement already satisfied: py>=1.5.0 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (1.8.0)
Requirement already satisfied: packaging in d:\anaconda3\lib\site-packages (from pytest->rpy2) (19.0)
Requirement already satisfied: attrs>=17.4.0 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (19.1.0)
Requirement already satisfied: more-itertools>=4.0.0 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (7.0.0)
Requirement already satisfied: atomicwrites>=1.0 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (1.3.0)
Requirement already satisfied: pluggy<1.0,>=0.12 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (0.12.0)
Requirement already satisfied: importlib-metadata>=0.12 in d:\anaconda3\lib\site-packages (from pytest->rpy2) (1.6.0)
Requirement already satisfied: wcwidth in d:\anaconda3\lib\site-packages (from pytest->rpy2) (0.1.7)
Requirement already satisfied: colorama in d:\anaconda3\lib\site-packages (from pytest->rpy2) (0.4.1)
Requirement already satisfied: MarkupSafe>=0.23 in d:\anaconda3\lib\site-packages (from jinja2->rpy2) (1.1.1)
Requirement already satisfied: six in d:\anaconda3\lib\site-packages (from packaging->pytest->rpy2) (1.12.0)
Requirement already satisfied: pyparsing>=2.0.2 in d:\anaconda3\lib\site-packages (from packaging->pytest->rpy2) (2.4.0)
Requirement already satisfied: zipp>=0.5 in d:\anaconda3\lib\site-packages (from importlib-metadata>=0.12->pytest->rpy2) (0.5.1)
Note: you may need to restart the kernel to use updated packages.
In [4]:
%%R
---------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-4-1289916e008b> in <module>
      3 os.environ["PATH"]   = r"D:\Install\R\R-3.6.1\bin\x64" + ";" + os.environ["PATH"]
      4 import rpy2
----> 5 from rpy2.robjects import pandas2ri, packages
      6 pandas2ri.activate()
      7 stats = packages.importr('stats')

D:\Anaconda3\lib\site-packages\rpy2\robjects\__init__.py in <module>
     14 from functools import partial
     15 import types
---> 16 import rpy2.rinterface as rinterface
     17 import rpy2.rlike.container as rlc
     18 

D:\Anaconda3\lib\site-packages\rpy2\rinterface.py in <module>
      4 import typing
      5 from typing import Union
----> 6 from rpy2.rinterface_lib import openrlib
      7 import rpy2.rinterface_lib._rinterface_capi as _rinterface
      8 import rpy2.rinterface_lib.embedded as embedded

D:\Anaconda3\lib\site-packages\rpy2\rinterface_lib\openrlib.py in <module>
     42     rlib = _rinterface_cffi.lib
     43 else:
---> 44     rlib = _dlopen_rlib(R_HOME)
     45 
     46 

D:\Anaconda3\lib\site-packages\rpy2\rinterface_lib\openrlib.py in _dlopen_rlib(r_home)
     35         raise ValueError('The library path cannot be None.')
     36     else:
---> 37         rlib = ffi.dlopen(lib_path)
     38     return rlib
     39 

OSError: cannot load library 'D:\Install\R\R-3.6.1\bin\x64\R.dll': error 0x7e